Trends in Data Science & Business Analytics
  • Home
  • Data Cleaning & Exploration
    • Data Cleaning
    • Exploratory Data Analysis
    • Skill Gap Analysis
  • Machine Learning Methods
    • Supervised Machine Learning
    • Unsupervised Machine Learning

Exploratory Data Analytics

Code
import pandas as pd
eda = pd.read_parquet("data/eda.parquet")
Code
# identifying data analyst jobs by keyword searching
keywords = ['Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
            'Data Science', 'Data Analysis','Data Analytics',  'Market Research Analyst' 
            'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
            'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst', 'Operations Analyst']

match = lambda col: eda[col].str.contains('|'.join(keywords), case=False, na=False)

eda['DATA_ANALYST_JOB'] = match('TITLE_NAME') \
             | match('SKILLS_NAME') \
             | match('SPECIALIZED_SKILLS_NAME') 
eda['DATA_ANALYST_JOB'].value_counts()
DATA_ANALYST_JOB
False    38212
True     33042
Name: count, dtype: int64
Code
import plotly.graph_objects as go
from plotly.subplots import make_subplots

df_grouped = (
    eda
    .groupby(['DATA_ANALYST_JOB','NAICS2_NAME'])
    .size()
    .reset_index(name='Job_Count')
)

short_names = {
    'Professional, Scientific, and Technical Services': 'Tech. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services'
}
df_grouped['Industry'] = df_grouped['NAICS2_NAME'].map(short_names).fillna(df_grouped['NAICS2_NAME'])
df_grouped['Job_Type'] = df_grouped['DATA_ANALYST_JOB'].map({True:'True', False:'False'})

pivot = (
    df_grouped
    .pivot_table(index='Industry', columns='Job_Type', values='Job_Count', fill_value=0)
    .reset_index()
)
industries = pivot['Industry'].tolist()
y_true  = pivot['True'].tolist()
y_false = pivot['False'].tolist()


# 2) Build a 2-row subplot: bar on top, table below

fig = make_subplots(
    rows=2, cols=1,
    row_heights=[0.70, 0.30],           # give a bit more room to the table
    specs=[[{"type":"bar"}],[{"type":"table"}]],
    vertical_spacing=0.12              # more space between bar and table
)

colors = {'True': '#FFE5E5', 'False': '#FF6B6B'}

fig.add_trace(
    go.Bar(
        x=industries, y=y_true, name='True',
        marker=dict(color=colors['True'], line=dict(color='#A81D1D', width=1)),
        text=y_true, textposition='outside'
    ),
    row=1, col=1
)
fig.add_trace(
    go.Bar(
        x=industries, y=y_false, name='False',
        marker=dict(color=colors['False'], line=dict(color='#A81D1D', width=1)),
        text=y_false, textposition='outside'
    ),
    row=1, col=1
)



# 3) Slider steps: 0 → 8 000 in 200s

steps = []
for val in range(0, 8001, 200):
    steps.append(dict(
        label=str(val),
        method="update",
        args=[
            {"y": [
                [v if v>=val else 0 for v in y_true],
                [v if v>=val else 0 for v in y_false]
            ]}
        ]
    ))


# 4) Final layout tweaks

fig.update_layout(
    # lift slider above everything
    sliders=[dict(
        active=0,
        currentvalue={"prefix":"Min Jobs: "},
        pad={"b":0},
        x=0.05,
        y=1.05,                # move slider way above the plot area
        xanchor="left",
        yanchor="bottom",
        len=0.7,
        font=dict(color='#A81D1D'),
        steps=steps
    )],

    title=dict(
        text="Data & Business Analytics Job Trends",
        font=dict(size=24, color='#A81D1D'),
        x=0.5,
        y=0.95,                # drop the title just below the slider
        xanchor="center",
        yanchor="top"
    ),

    width=1100, height=850,
    margin=dict(l=60, r=60, t=180, b=200),  # extra top & bottom margin

    plot_bgcolor='white',
    paper_bgcolor='white',

    xaxis=dict(
        title="Industry",
        title_font=dict(size=16, color='#A81D1D'),
        tickmode='array',
        tickvals=list(range(len(industries))),
        ticktext=industries,
        tickangle=-30,
        tickfont=dict(size=11, color='#333'),
        showline=True, linecolor='#A81D1D'
    ),
    yaxis=dict(
        title="Number of Jobs",
        title_font=dict(size=16, color='#A81D1D'),
        tickfont=dict(size=11, color='#333'),
        gridcolor='rgba(200,200,200,0.3)',
        showline=True, linecolor='#A81D1D',
        range=[0, max(max(y_true),max(y_false))*1.2]
    ),

    legend=dict(
        title="Data Analyst Job",
        title_font=dict(color='#A81D1D'),
        font=dict(size=12),
        x=0.95, y=0.95
    ),

    bargap=0.2
)

fig.write_html(
    "figures/edaplot1.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)
Code
import plotly.express as px
import pandas as pd

# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()

# Step 1: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Step 2: Clean the data (remove rows with missing SPECIALIZED_SKILLS_NAME)
df = df.dropna(subset=['SPECIALIZED_SKILLS_NAME'])

# Debug: Check the number of rows after cleaning
print("Number of rows after cleaning:", len(df))

# Step 3: Split the SPECIALIZED_SKILLS_NAME into individual skills
# Assuming SPECIALIZED_SKILLS_NAME is a string of skills separated by commas or another delimiter
df_skills = df.copy()
df_skills['SPECIALIZED_SKILLS_NAME'] = df_skills['SPECIALIZED_SKILLS_NAME'].str.split(',')  # Adjust delimiter if needed
df_skills = df_skills.explode('SPECIALIZED_SKILLS_NAME')
df_skills['SPECIALIZED_SKILLS_NAME'] = df_skills['SPECIALIZED_SKILLS_NAME'].str.strip()

# Step 4: Group by skill and Job_Category to get the count
df_skills_count = df_skills.groupby(['SPECIALIZED_SKILLS_NAME', 'Job_Category']).size().reset_index(name='Count')

# Step 5: Get the top 10 skills by total count
top_skills = df_skills_count.groupby('SPECIALIZED_SKILLS_NAME')['Count'].sum().nlargest(10).index
df_skills_top = df_skills_count[df_skills_count['SPECIALIZED_SKILLS_NAME'].isin(top_skills)]

# Debug: Check the grouped data
print("Top 10 specialized skills:")
print(df_skills_top)

# Step 6: Create the bar plot
fig = px.bar(
    df_skills_top,
    x='Count',
    y='SPECIALIZED_SKILLS_NAME',
    color='Job_Category',
    barmode='stack',
    color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'},
    title='Top 10 Specialized Skills by Job Category'
)

# Step 7: Update layout for styling
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
    title=dict(
        font=dict(size=24, color='#FF6B6B'),
        x=0.5,
        xanchor='center',
        y=0.99,
        yanchor='top'
    ),
    xaxis=dict(
        title='Number of Jobs',
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor='#E2E8F0',
        linecolor='#2D3748',
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False
    ),             
    yaxis=dict(
        title='Specialized Skill',
        title_font=dict(size=16),
        tickfont=dict(size=12)
    ),
    legend=dict(
        title='Job Category',
        font=dict(size=13),
        bgcolor='#FFFFFF',
        bordercolor='#FF6B6B',
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor='left',
        yanchor='middle'
    )
)


# Save to HTML
fig.write_html(
    'figures/edaplot2.html',
    include_plotlyjs='cdn',
    full_html=False
)
Number of rows after cleaning: 71254
Top 10 specialized skills:
               SPECIALIZED_SKILLS_NAME       Job_Category  Count
2071           "Business Intelligence"      Analytics Job   8077
2072           "Business Intelligence"  Non-Analytics Job   1778
2167                "Business Process"      Analytics Job   4417
2168                "Business Process"  Non-Analytics Job   8385
2180           "Business Requirements"      Analytics Job   4972
2181           "Business Requirements"  Non-Analytics Job   7740
4151                       "Dashboard"      Analytics Job   9975
4152                       "Dashboard"  Non-Analytics Job   1559
4173                   "Data Analysis"      Analytics Job  25620
6640                         "Finance"      Analytics Job   5573
6641                         "Finance"  Non-Analytics Job   6220
13332             "Project Management"      Analytics Job   6236
13333             "Project Management"  Non-Analytics Job   6939
13594  "Python (Programming Language)"      Analytics Job  10127
13595  "Python (Programming Language)"  Non-Analytics Job   1703
14459               "SAP Applications"      Analytics Job   2174
14460               "SAP Applications"  Non-Analytics Job   9149
14686     "SQL (Programming Language)"      Analytics Job  16067
14687     "SQL (Programming Language)"  Non-Analytics Job   4202
Code
import plotly.express as px
import pandas as pd

# Prepare the data
df = eda.copy()

# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    if row['DATA_ANALYST_JOB']:
        return True
    title = str(row['TITLE_NAME']).lower() if 'TITLE_NAME' in row else str(row['TITLE']).lower()
    if 'business analyst' in title:
        return True
    return False

df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2

# Clean the data (remove rows with missing salary or experience)
df = df.dropna(subset=['Avg_Years_Experience', 'SALARY'])

# Create the scatter plot with trend line
fig = px.scatter(df, 
                 x='Avg_Years_Experience', 
                 y='SALARY', 
                 color='Job_Category',
                 trendline='ols',  # Add trend line (ordinary least squares)
                 title='Experience Requirements vs Salary for Analytics Jobs',
                 labels={'Avg_Years_Experience': 'Average Years of Experience', 'SALARY': 'Salary ($)', 'Job_Category': 'Job Category'},
                 color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'})

# Beautify the layout with a red-white theme (no gradients)
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',  # Plain white background
    paper_bgcolor='#FFFFFF',  # Plain white background
    font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
    title=dict(
        font=dict(size=24, color="#FF6B6B"),  # Red title for theme
        x=0.5,
        xanchor="center",
        y=0.95,
        yanchor="top"
    ),
    xaxis=dict(
        title="Average Years of Experience",
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor="#E2E8F0",
        linecolor="#2D3748",
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False
    ),
    yaxis=dict(
        title="Salary ($)",
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor="#E2E8F0",
        linecolor="#2D3748",
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False
    ),
    legend=dict(
        title="Job Category",
        font=dict(size=13),
        bgcolor="#FFFFFF",
        bordercolor="#FF6B6B",  # Red border for theme
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor="left",
        yanchor="middle"
    ),
    hovermode="closest",
    hoverlabel=dict(
        bgcolor="#FFFFFF",
        font_size=12,
        font_family="Inter, sans-serif",
        font_color="#2D3748",
        bordercolor="#FF6B6B"  # Red border for hover
    )
)

# Customize scatter points
fig.update_traces(
    marker=dict(
        size=8,
        opacity=0.7,
        line=dict(width=1, color="#2D3748")
    )
)


fig.write_html(
    "figures/edaplot3.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)
Code
import plotly.express as px
import pandas as pd

# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()

# Step 1: Clean the data (remove rows with missing STATE_NAME)
df = df.dropna(subset=['STATE_NAME'])

# Debug: Check the number of rows after cleaning and unique states
print("Number of rows after cleaning:", len(df))
print("Unique states extracted:", df['STATE_NAME'].unique())

# Step 2: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Step 3: Aggregate data by state and job category
df_state_counts = df.groupby(['STATE_NAME', 'Job_Category']).size().reset_index(name='Job_Count')

# Step 4: Pivot the data to get counts for Analytics and Non-Analytics Jobs
df_pivot = df_state_counts.pivot(index='STATE_NAME', columns='Job_Category', values='Job_Count').fillna(0)
df_pivot['Total_Jobs'] = df_pivot.get('Analytics Job', 0) + df_pivot.get('Non-Analytics Job', 0)
df_pivot = df_pivot.reset_index()

# Debug: Check the aggregated data
print("Aggregated data by state:")
print(df_pivot)

# Step 5: Find states with minimum and maximum jobs
min_jobs_row = df_pivot.loc[df_pivot['Total_Jobs'].idxmin()]
max_jobs_row = df_pivot.loc[df_pivot['Total_Jobs'].idxmax()]
min_jobs_state = min_jobs_row['STATE_NAME']
max_jobs_state = max_jobs_row['STATE_NAME']
min_jobs_count = min_jobs_row['Total_Jobs']
max_jobs_count = max_jobs_row['Total_Jobs']

# Step 6: Map state names to state codes for Plotly choropleth
state_name_to_code = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
    'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
    'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
}

df_pivot['State_Code'] = df_pivot['STATE_NAME'].map(state_name_to_code)

# Step 7: Clean the data (remove rows with unmapped states)
df_pivot = df_pivot.dropna(subset=['State_Code'])

# Debug: Check the final data before plotting
print("Final data for plotting:")
print(df_pivot[['STATE_NAME', 'State_Code', 'Total_Jobs']])

# Step 8: Define the color scale range and ticks
min_range = 100
max_range = 8100
increment = 900  # To get 10 divisions between 100 and 8100
tickvals = list(range(min_range, max_range + increment, increment))
tickvals = tickvals[:10]  # Ensure 8-10 divisions
ticktext = [str(val) for val in tickvals]

# Debug: Print the tick values for the color bar
print("Color bar tick values:", tickvals)

# Step 9: Create the choropleth map with a custom linear color scale
fig = px.choropleth(
    df_pivot,
    locations='State_Code',
    locationmode='USA-states',
    color='Total_Jobs',
    color_continuous_scale='Reds',  # Use a red gradient for impact
    scope='usa',
    range_color=[min_range, max_range],  # Explicitly set the range
    title='Geographic Distribution of Analytics Job Postings (2025)',
    hover_data=['STATE_NAME', 'Analytics Job', 'Non-Analytics Job', 'Total_Jobs'],
    labels={'Total_Jobs': 'Number of Jobs'}
)

# Step 10: Update layout for a stunning visualization
fig.update_layout(
    width=1000,
    height=700,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
    title=dict(
        text='Geographic Distribution of Analytics Job Postings (2025)',
        font=dict(size=28, color='#FF6B6B', family='Inter, sans-serif'),
        x=0.5,
        xanchor='center',
        y=0.95,
        yanchor='top'
    ),
    geo=dict(
        bgcolor='white',
        lakecolor='white',
        landcolor='lightgray',
        subunitcolor='black',
        showlakes=True,
        showsubunits=True,
        showframe=True,
        framecolor='#2D3748',
        framewidth=2
    ),
    coloraxis_colorbar=dict(
        title='Number of Jobs',
        title_font=dict(size=16, family='Inter, sans-serif', color='#FF6B6B'),
        tickfont=dict(size=12, family='Inter, sans-serif', color='#2D3748'),
        tickvals=tickvals,
        ticktext=ticktext,
        len=0.8,
        thickness=20,
        outlinecolor='#2D3748',
        outlinewidth=1,
        bgcolor='rgba(255,255,255,0.8)'
    ),
    margin=dict(l=50, r=50, t=100, b=50)
)

# Step 11: Add annotations for Wyoming and Texas
fig.add_annotation(
    x=0.05,
    y=0.05,
    xref="paper",
    yref="paper",
    text=f"Lowest: Wyoming ({min_jobs_count} jobs)",
    showarrow=False,
    font=dict(size=12, color='#FF6B6B', family='Inter, sans-serif'),
    bgcolor='rgba(255,255,255,0.8)',
    bordercolor='#FF6B6B',
    borderwidth=1
)

fig.add_annotation(
    x=0.95,
    y=0.05,
    xref="paper",
    yref="paper",
    text=f"Highest: Texas ({max_jobs_count} jobs)",
    showarrow=False,
    font=dict(size=12, color='#FF6B6B', family='Inter, sans-serif'),
    bgcolor='rgba(255,255,255,0.8)',
    bordercolor='#FF6B6B',
    borderwidth=1,
    xanchor='right'
)

# Step 12: Save to HTML
fig.write_html(
    'figures/edaplot4.html',
    include_plotlyjs='cdn',
    full_html=False
)
Number of rows after cleaning: 71254
Unique states extracted: ['Arkansas' 'Maine' 'Texas' 'Arizona' 'California' 'Ohio' 'New Jersey'
 'New York' 'Hawaii' 'Georgia' 'Michigan' 'Mississippi' 'Massachusetts'
 'Alaska' 'Alabama' 'Indiana' 'Virginia' 'South Carolina' 'Colorado'
 'Nevada' 'Minnesota' 'Oregon' 'Oklahoma' 'North Carolina' 'Florida'
 'Washington' 'Delaware' 'Illinois' 'Pennsylvania' 'Kansas' 'Tennessee'
 'Washington, D.C. (District of Columbia)' 'Maryland' 'Idaho' 'Louisiana'
 'Connecticut' 'Nebraska' 'Missouri' 'North Dakota' 'Utah' 'New Hampshire'
 'Wisconsin' 'Kentucky' 'Rhode Island' 'Iowa' 'South Dakota' 'Montana'
 'New Mexico' 'Wyoming' 'West Virginia' 'Vermont']
Aggregated data by state:
Job_Category                               STATE_NAME  Analytics Job  \
0                                             Alabama            376   
1                                              Alaska            133   
2                                             Arizona            673   
3                                            Arkansas            252   
4                                          California           3239   
5                                            Colorado            646   
6                                         Connecticut            390   
7                                            Delaware            219   
8                                             Florida           1526   
9                                             Georgia           1144   
10                                             Hawaii            140   
11                                              Idaho            214   
12                                           Illinois           1595   
13                                            Indiana            442   
14                                               Iowa            285   
15                                             Kansas            330   
16                                           Kentucky            279   
17                                          Louisiana            230   
18                                              Maine            193   
19                                           Maryland            791   
20                                      Massachusetts            987   
21                                           Michigan            709   
22                                          Minnesota            709   
23                                        Mississippi            267   
24                                           Missouri            587   
25                                            Montana            105   
26                                           Nebraska            221   
27                                             Nevada            237   
28                                      New Hampshire            134   
29                                         New Jersey           1022   
30                                         New Mexico            163   
31                                           New York           1752   
32                                     North Carolina           1284   
33                                       North Dakota             96   
34                                               Ohio           1149   
35                                           Oklahoma            280   
36                                             Oregon            468   
37                                       Pennsylvania            962   
38                                       Rhode Island            253   
39                                     South Carolina            333   
40                                       South Dakota            149   
41                                          Tennessee            599   
42                                              Texas           2967   
43                                               Utah            350   
44                                            Vermont            118   
45                                           Virginia           1906   
46                                         Washington            856   
47            Washington, D.C. (District of Columbia)            668   
48                                      West Virginia             76   
49                                          Wisconsin            477   
50                                            Wyoming             61   

Job_Category  Non-Analytics Job  Total_Jobs  
0                           282         658  
1                           101         234  
2                           932        1605  
3                           299         551  
4                          3813        7052  
5                           786        1432  
6                           447         837  
7                           214         433  
8                          2078        3604  
9                          1481        2625  
10                           97         237  
11                          232         446  
12                         1912        3507  
13                          487         929  
14                          307         592  
15                          382         712  
16                          325         604  
17                          198         428  
18                          149         342  
19                          548        1339  
20                         1027        2014  
21                         1097        1806  
22                          740        1449  
23                          172         439  
24                          611        1198  
25                           78         183  
26                          291         512  
27                          319         556  
28                          133         267  
29                         1569        2591  
30                           91         254  
31                         1579        3331  
32                         1426        2710  
33                           50         146  
34                         1442        2591  
35                          298         578  
36                          593        1061  
37                         1284        2246  
38                          176         429  
39                          288         621  
40                          139         288  
41                          643        1242  
42                         5083        8050  
43                          259         609  
44                          108         226  
45                         1685        3591  
46                          744        1600  
47                          551        1219  
48                           80         156  
49                          544        1021  
50                           42         103  
Final data for plotting:
Job_Category      STATE_NAME State_Code  Total_Jobs
0                    Alabama         AL         658
1                     Alaska         AK         234
2                    Arizona         AZ        1605
3                   Arkansas         AR         551
4                 California         CA        7052
5                   Colorado         CO        1432
6                Connecticut         CT         837
7                   Delaware         DE         433
8                    Florida         FL        3604
9                    Georgia         GA        2625
10                    Hawaii         HI         237
11                     Idaho         ID         446
12                  Illinois         IL        3507
13                   Indiana         IN         929
14                      Iowa         IA         592
15                    Kansas         KS         712
16                  Kentucky         KY         604
17                 Louisiana         LA         428
18                     Maine         ME         342
19                  Maryland         MD        1339
20             Massachusetts         MA        2014
21                  Michigan         MI        1806
22                 Minnesota         MN        1449
23               Mississippi         MS         439
24                  Missouri         MO        1198
25                   Montana         MT         183
26                  Nebraska         NE         512
27                    Nevada         NV         556
28             New Hampshire         NH         267
29                New Jersey         NJ        2591
30                New Mexico         NM         254
31                  New York         NY        3331
32            North Carolina         NC        2710
33              North Dakota         ND         146
34                      Ohio         OH        2591
35                  Oklahoma         OK         578
36                    Oregon         OR        1061
37              Pennsylvania         PA        2246
38              Rhode Island         RI         429
39            South Carolina         SC         621
40              South Dakota         SD         288
41                 Tennessee         TN        1242
42                     Texas         TX        8050
43                      Utah         UT         609
44                   Vermont         VT         226
45                  Virginia         VA        3591
46                Washington         WA        1600
48             West Virginia         WV         156
49                 Wisconsin         WI        1021
50                   Wyoming         WY         103
Color bar tick values: [100, 1000, 1900, 2800, 3700, 4600, 5500, 6400, 7300, 8200]
Code
import plotly.graph_objects as go
import pandas as pd

# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()

# Step 1: Clean the data (remove rows with missing values for key columns)
df = df.dropna(subset=['MIN_EDULEVELS_NAME', 'NAICS2_NAME'])

# Step 2: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Step 3: Shorten industry names for better display
short_names = {
    'Professional, Scientific, and Technical Services': 'Tech. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services'
}
df['Industry'] = df['NAICS2_NAME'].map(short_names).fillna(df['NAICS2_NAME'])

# Step 4: Get the top 5 industries by total job count to keep the diagram manageable
total_jobs_by_industry = df.groupby('Industry').size().nlargest(5)
top_industries = total_jobs_by_industry.index.tolist()
df = df[df['Industry'].isin(top_industries)]

# Step 5: Aggregate data to get flows
# Flow from Education Level to Job Category
edu_to_job = df.groupby(['MIN_EDULEVELS_NAME', 'Job_Category']).size().reset_index(name='Count')

# Flow from Job Category to Industry
job_to_industry = df.groupby(['Job_Category', 'Industry']).size().reset_index(name='Count')

# Debug: Check the aggregated data
print("Flow from Education Level to Job Category:")
print(edu_to_job)
print("Flow from Job Category to Industry:")
print(job_to_industry)

# Step 6: Create nodes and links for the Sankey diagram
# Nodes: Education Levels + Job Categories + Industries
edu_levels = list(df['MIN_EDULEVELS_NAME'].unique())
job_categories = list(df['Job_Category'].unique())
industries = list(df['Industry'].unique())

# Create a list of all nodes
all_nodes = edu_levels + job_categories + industries
node_indices = {node: idx for idx, node in enumerate(all_nodes)}

# Links: Education Level -> Job Category
links_source = []
links_target = []
links_value = []
for _, row in edu_to_job.iterrows():
    source = node_indices[row['MIN_EDULEVELS_NAME']]
    target = node_indices[row['Job_Category']]
    value = row['Count']
    links_source.append(source)
    links_target.append(target)
    links_value.append(value)

# Links: Job Category -> Industry
for _, row in job_to_industry.iterrows():
    source = node_indices[row['Job_Category']]
    target = node_indices[row['Industry']]
    value = row['Count']
    links_source.append(source)
    links_target.append(target)
    links_value.append(value)

# Satep 7: Create the Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color='#2D3748', width=0.5),
        label=all_nodes,
        color=['#FF6B6B' if 'Job' in node else '#FFE5E5' for node in all_nodes]  # Red for job categories, light red for others
    ),
    link=dict(
        source=links_source,
        target=links_target,
        value=links_value,
        color=['#FF6B6B' if all_nodes[source] in job_categories else '#FFE5E5' for source in links_source]  # Red links for job category flows
    )
)])

# Step 8: Update layout for a stunning visualization
fig.update_layout(
    title=dict(
        text='Flow of Jobs: Education Level → Job Category → Industry (2025)',
        font=dict(size=28, color='#FF6B6B', family='Inter, sans-serif'),
        x=0.5,
        xanchor='center',
        y=0.95,
        yanchor='top'
    ),
    width=1000,
    height=600,
    font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
    plot_bgcolor='white',
    paper_bgcolor='white',
    margin=dict(l=50, r=50, t=100, b=50)
)

# Step 9: Add annotations for context
fig.add_annotation(
    x=0.05,
    y=0.05,
    xref="paper",
    yref="paper",
    text="Flow represents job counts across categories",
    showarrow=False,
    font=dict(size=12, color='#FF6B6B', family='Inter, sans-serif'),
    bgcolor='rgba(255,255,255,0.8)',
    bordercolor='#FF6B6B',
    borderwidth=1
)

# Step 10: Save to HTML
fig.write_html(
    'figures/edaplot5.html',
    include_plotlyjs='cdn',
    full_html=False
)
Flow from Education Level to Job Category:
              MIN_EDULEVELS_NAME       Job_Category  Count
0               Associate degree      Analytics Job    495
1               Associate degree  Non-Analytics Job   1618
2              Bachelor's degree      Analytics Job  14718
3              Bachelor's degree  Non-Analytics Job  14576
4             High school or GED      Analytics Job    919
5             High school or GED  Non-Analytics Job   1510
6                Master's degree      Analytics Job    564
7                Master's degree  Non-Analytics Job    572
8            No Education Listed      Analytics Job   5955
9            No Education Listed  Non-Analytics Job  11856
10  Ph.D. or professional degree      Analytics Job     22
11  Ph.D. or professional degree  Non-Analytics Job     27
Flow from Job Category to Industry:
        Job_Category               Industry  Count
0      Analytics Job     Admin & Waste Mgmt   4105
1      Analytics Job                Finance   4246
2      Analytics Job          Manufacturing   2554
3      Analytics Job         Tech. Services   7620
4      Analytics Job  Unclassified Industry   4148
5  Non-Analytics Job     Admin & Waste Mgmt   4296
6  Non-Analytics Job                Finance   2886
7  Non-Analytics Job          Manufacturing   2169
8  Non-Analytics Job         Tech. Services  15552
9  Non-Analytics Job  Unclassified Industry   5256